# ============================================================================
# Replication Script for Existing Surveys and Index 
# Gender Differences in Immigration Attitudes and Gender Economic Equality
# ISSP Data (1995, 2003, 2013 cumulative) & World Economic Forum (WEF) Index
# ============================================================================

# Load necessary libraries
library(tidyverse)  
library(haven)     
library(broom)      
library(ggrepel)    

# ============================================================================
# DATA LOADING AND PREPARATION
# ============================================================================

# Load ISSP data (1995, 2003, 2013 cumulative dataset)
# Source: International Social Survey Programme. 2020. "ISSP 1995/2003/2013 Cumulation - 
# 'National Identity I-III' – ZA No.5960/5961." 
# https://www.gesis.org/en/issp/data-and-documentation/national-identity/cumulation
df <- read_dta("ISSP_National Identity I-III - ISSP 1995-2003-2013\\ZA5960_v1-0-0.dta\\ZA5960_v1-0-0.dta")
names(df) <- tolower(names(df))

# Load WEF gender equality data
# Source: World Economic Forum. 2013. "Global Gender Gap Report 2013."
# https://www.weforum.org/publications/global-gender-gap-report-2013/
# Note: Data manually extracted from WEF 2013 report and compiled into CSV
wef <- read.csv("wef_data_from_report.csv")

# ============================================================================
# VARIABLE CREATION
# ============================================================================

# Create female indicator variable
df$female <- abs(df$sex - 1)
df$female[df$female > 1] <- NA

# Create immigration attitude variable (V48)
# Original scale: 1=Increased a lot, 2=Increased a little, 3=Remain same, 
#                4=Reduced a little, 5=Reduced a lot
# Reversed to: Higher values = more positive toward immigration
df$immi.pos <- df$v48
df$immi.pos[df$immi.pos < 0] <- NA  # Remove missing values
df$immi.pos <- abs(df$immi.pos - 6)  # Reverse scale

# Create partisan left variable (party_lr reversed to party_rl)
df$party_rl <- case_when(
  df$party_lr < 0 ~ NA_real_,
  df$party_lr == 5 ~ 1,  # Far right becomes 1 (far left)
  df$party_lr == 4 ~ 2,  # Right becomes 2 (left)
  df$party_lr == 3 ~ 3,  # Center stays 3 (center)
  df$party_lr == 2 ~ 4,  # Left becomes 4 (right)
  df$party_lr == 1 ~ 5,  # Far left becomes 5 (far right)
  TRUE ~ NA_real_
)

# ============================================================================
# DATA MERGING AND SUMMARY STATISTICS
# ============================================================================
# Merge ISSP and WEF data
df$country_no <- df$country
df_combined <- df %>% 
  left_join(wef, by = "country_no")

# Create country-level summary statistics 
df_summary <- df_combined %>%
  group_by(country_name) %>%
  do(data.frame(
    country_name = first(.$country_name),
    mean_men = mean(.$immi.pos[.$female == 0], na.rm = TRUE),
    mean_women = mean(.$immi.pos[.$female == 1], na.rm = TRUE),
    wef_eql_score = mean(.$wef13_economic.overall, na.rm = TRUE)
  )) %>%
  mutate(
    gender_diff = mean_men - mean_women,
    gender_diff_pct = gender_diff/8*100
  ) %>%
  arrange(desc(gender_diff_pct))

# Clean dataset (remove missing values)
df_summary_clean <- df_summary %>%
  filter(!is.na(wef_eql_score) & !is.na(gender_diff))

# ============================================================================
# STATISTICAL TESTING
# ============================================================================
library(broom)  

# Function to perform t-test for each country
perform_country_ttest <- function(data) {
  result <- t.test(data$immi.pos[data$female == 1], 
                   data$immi.pos[data$female == 0])
  tidy_result <- tidy(result)
  return(tidy_result)
}

# Perform t-tests by country
t_test_results <- df_combined %>%
  group_by(country_name) %>%
  do(perform_country_ttest(.)) %>%
  select(country_name, estimate1, estimate2, p.value) %>%
  mutate(significant = ifelse(p.value < 0.05, "Significant", "Non-Significant"))

# Merge t-test results with summary statistics
df_summary_clean <- df_summary_clean %>%
  left_join(t_test_results, by = "country_name")

# Create gender equality terciles
df_summary_clean <- df_summary_clean %>%
  mutate(gender_equality_tercile = case_when(
    wef_eql_score <= quantile(wef_eql_score, 1/3, na.rm = TRUE) ~ "Low Gender Equality",
    wef_eql_score <= quantile(wef_eql_score, 2/3, na.rm = TRUE) ~ "Medium Gender Equality",
    TRUE ~ "High Gender Equality"
  ))


# ============================================================================
# CATEGORIZATION BY GENDER EQUALITY LEVELS
# ============================================================================

# Split countries by mean gender economic equality score
mean_wef_score <- mean(df_summary_clean$wef_eql_score, na.rm = TRUE)

countries_below_mean <- df_summary_clean %>%
  filter(wef_eql_score < mean_wef_score)

countries_above_mean <- df_summary_clean %>%
  filter(wef_eql_score > mean_wef_score)

# Create tercile categories for combined analysis
terciles <- quantile(df_summary_clean$wef_eql_score, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE)

df_summary_clean <- df_summary_clean %>%
  mutate(
    gender_equality_tercile = case_when(
      wef_eql_score <= terciles[2] ~ "Low Gender Equality",
      wef_eql_score <= terciles[3] ~ "Medium Gender Equality",
      wef_eql_score <= terciles[4] ~ "High Gender Equality",
      TRUE ~ NA_character_
    )
  )

# ============================================================================
# MAIN PLOTS
# ============================================================================

# Set consistent axis limits for comparability
x_range <- range(c(countries_above_mean$mean_men, countries_below_mean$mean_men), na.rm = TRUE)
y_range <- range(c(countries_above_mean$mean_women, countries_below_mean$mean_women), na.rm = TRUE)
x_limits <- c(x_range[1] - 0.05, x_range[2] + 0.05)
y_limits <- c(y_range[1] - 0.05, y_range[2] + 0.05)

# Figure 1a: Countries with higher gender equality 
# ("Figure 1a. Immigration Preferences in Countries with Above-Average Gender Equality")
figure_1a <- ggplot(countries_above_mean, aes(x = mean_men, y = mean_women, label = country_name)) +
  geom_point(aes(color = gender_diff > 0, shape = significant), size = 4) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_text_repel(
    size = 5, min.segment.length = 0, 
    box.padding = 0.3, max.overlaps = Inf, 
    nudge_x = .01, nudge_y = .01, color = "grey50"
  ) +
  labs(
    x = "Men's Positive Immigration Attitudes", 
    y = "Women's Positive Immigration Attitudes"
  ) +
  scale_color_discrete(
    name = "Gender Difference", 
    labels = c("Women > Men", "Men > Women")
  ) + 
  scale_shape_manual(
    values = c(16, 17),
    name = "Statistical Significance", 
    labels = c("Non-Significant", "Significant")
  ) +
  scale_x_continuous(limits = x_limits) +
  scale_y_continuous(limits = y_limits) +
  theme_bw() +
  theme(
    legend.position = "right", 
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )
print(figure_1a)

# Figure 1b: Countries with lower gender equality 
# ("Figure 1b. "Immigration Preferences in Countries with Below-Average Gender Equality")
figure_1b <- ggplot(countries_below_mean, aes(x = mean_men, y = mean_women, label = country_name)) +
  geom_point(aes(color = gender_diff > 0, shape = significant), size = 4) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_text_repel(
    size = 5, min.segment.length = 0, 
    box.padding = 0.3, max.overlaps = Inf, 
    nudge_x = .01, nudge_y = .01, color = "grey50"
  ) +
  labs(
    x = "Men's Positive Immigration Attitudes", 
    y = "Women's Positive Immigration Attitudes"
  ) +
  scale_color_discrete(
    name = "Gender Difference", 
    labels = c("Women > Men", "Men > Women")
  ) + 
  scale_shape_manual(
    values = c(16, 17),
    name = "Statistical Significance", 
    labels = c("Non-Significant", "Significant")
  ) +
  scale_x_continuous(limits = x_limits) +
  scale_y_continuous(limits = y_limits) +
  theme_bw() +
  theme(
    legend.position = "right", 
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )
print(figure_1b)

# Figure 1c: Combined analysis with tercile regression lines 
# ("Figure 1c. Immigration Preferences by Economic Gender Equality Terciles")
figure_1c <- ggplot(df_summary_clean, aes(x = mean_men, y = mean_women)) +
  geom_smooth(
    aes(color = gender_equality_tercile), 
    method = "lm", se = FALSE, size = 1.5
  ) +
  geom_point(
    aes(color = gender_equality_tercile, shape = significant), 
    size = 4, alpha = 0.8
  ) +
  geom_abline(
    slope = 1, intercept = 0, 
    linetype = "dashed", color = "black", size = 1
  ) +
  geom_text_repel(
    aes(label = country_name), 
    size = 4, min.segment.length = 0, 
    box.padding = 0.3, max.overlaps = Inf, 
    nudge_x = .01, nudge_y = .01, 
    color = "grey30"
  ) +
  scale_color_manual(
    values = c(
      "Low Gender Equality" = "#1b9e77",
      "Medium Gender Equality" = "#d95f02", 
      "High Gender Equality" = "#7570b3"
    ),
    name = "Gender Equality Level"
  ) +
  scale_shape_manual(
    values = c("Non-Significant" = 16, "Significant" = 17),
    name = "Statistical Significance"
  ) +
  labs(
    x = "Men's Positive Immigration Attitudes", 
    y = "Women's Positive Immigration Attitudes"
  ) +
  theme_bw() +
  theme(
    legend.position = "right", 
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )
print(figure_1c)

# ============================================================================
# APPENDIX PLOTS (SPIKE PLOTS)
# ============================================================================
# Create spike data for appendix plots
spike_data <- df_combined %>%
  group_by(country_name) %>%
  summarise(
    # Immigration gender gap (women - men)
    immigration_gap = mean(immi.pos[female == 1], na.rm = TRUE) - 
      mean(immi.pos[female == 0], na.rm = TRUE),
    
    # Labor force participation gap (WEF labor force participation ratio)
    labor_participation_gap = mean(wef13_labor, na.rm = TRUE),
    
    # Partisan left gender gap (women - men on party_rl scale)
    partisan_left_gap = mean(party_rl[female == 1], na.rm = TRUE) - 
      mean(party_rl[female == 0], na.rm = TRUE),
    
    # Gender equality score for x-axis
    gender_equality = mean(wef13_economic.overall, na.rm = TRUE),
    
    .groups = 'drop'
  ) %>%
  filter(!is.na(gender_equality) & !is.na(immigration_gap))


# Set consistent y-limits for spike plots
y_range_spike <- range(spike_data$immigration_gap, na.rm = TRUE)
y_limits_spike <- c(y_range_spike[1] - 0.05, y_range_spike[2] + 0.05)

# Common theme for spike plots
spike_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 13, face = "bold", margin = margin(b = 15)),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    plot.caption = element_text(size = 9, color = "grey40"),
    panel.grid.minor = element_blank(),
    plot.margin = margin(20, 20, 20, 20)
  )

# Appendix Plot A1: Gender Gap in Immigration Preferences vs. Gender Economic Equality Score
figure_a1 <- ggplot(spike_data, aes(x = gender_equality, y = immigration_gap)) +
  geom_segment(aes(xend = gender_equality, yend = 0), 
               color = "steelblue", size = 1, alpha = 0.7) +
  geom_point(color = "steelblue", size = 2.5, alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 0.6) +
  geom_text_repel(aes(label = country_name), size = 3.2, box.padding = 0.3) +
  labs(
    title = "A1. Gender Gap in Immigration Preferences vs. Gender Economic Equality Score\n(Data from WEF 2013 & ISSP 1995/2003/2013 Cumulation)",
    x = "WEF Gender Economic Equality Score (Economic Participation and Opportunity Index)",
    y = "Immigration Gender Gap (Women - Men)") +
  spike_theme +
  scale_y_continuous(limits = y_limits_spike)
print(figure_a1)

# Appendix Plot A2: Gender Gap in Immigration Preferences vs. Labor Force Participation
figure_a2 <- ggplot(spike_data, aes(x = labor_participation_gap, y = immigration_gap)) +
  geom_segment(aes(xend = labor_participation_gap, yend = 0), 
               color = "darkgreen", size = 1, alpha = 0.7) +
  geom_point(color = "darkgreen", size = 2.5, alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 0.6) +
  geom_text_repel(aes(label = country_name), size = 3.2, box.padding = 0.3) +
  labs(
    title = "A2. Gender Gap in Immigration Preferences vs. Labor Force Participation\n(Data from WEF 2013 & ISSP 1995/2003/2013 Cumulation)",
    x = "WEF Labor Force Participation Ratio (Women/Men)",
    y = "Immigration Gender Gap (Women - Men)") +
  spike_theme +
  scale_y_continuous(limits = y_limits_spike)
print(figure_a2)

# Appendix Plot A3: Gender Gap in Immigration Preferences vs. Partisanship
# Create spike data for appendix plots with better NA handling
spike_data <- df_combined %>%
  group_by(country_name) %>%
  summarise(
    # Immigration gender gap (women - men)
    immigration_gap = mean(immi.pos[female == 1], na.rm = TRUE) - 
      mean(immi.pos[female == 0], na.rm = TRUE),
    
    # Labor force participation gap
    labor_participation_gap = mean(wef13_labor, na.rm = TRUE),
    
    # Partisan left gender gap (women - men)
    partisan_left_gap = mean(party_rl[female == 1], na.rm = TRUE) - 
      mean(party_rl[female == 0], na.rm = TRUE),
    
    # Gender equality score
    gender_equality = mean(wef13_economic.overall, na.rm = TRUE),
    
    .groups = 'drop'
  ) %>%
  filter(!is.na(gender_equality) & !is.na(immigration_gap))

# Create filtered data for plot A3 that removes countries with missing partisan data
spike_data_partisan <- spike_data %>%
  filter(!is.na(partisan_left_gap) & is.finite(partisan_left_gap))

# Plotting
figure_a3 <- ggplot(spike_data_partisan, aes(x = partisan_left_gap, y = immigration_gap)) +
  geom_segment(aes(xend = partisan_left_gap, yend = 0), 
               color = "purple", linewidth = 1, alpha = 0.7) +
  geom_point(color = "purple", size = 2.5, alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", linewidth = 0.6) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red", linewidth = 0.6) +
  geom_text_repel(aes(label = country_name), size = 3.2, box.padding = 0.3) +
  labs(
    title = "A3. Gender Gap in Immigration Preferences vs. Partisanship\n(Data from WEF 2013 & ISSP 1995/2003/2013 Cumulation)",
    x = "ISSP Partisan Left Gender Gap (Women - Men)",
    y = "Immigration Gender Gap (Women - Men)") +
  spike_theme +
  scale_y_continuous(limits = y_limits_spike)
print(figure_a3)


